rm(list = ls())

library(data.table)
library(randomizr)
library(sandwich)
library(xtable)

data_pooled <- readRDS('./replication_hasz/output/data/data_pooled.rds')
data <- data_pooled[study!='Pooled']
data[independent==1, strong_partisan:=NA]

# refine race variable
data[,black:=ifelse(race=='Black or African American',1,0)]
data[,asian:=ifelse(race=='Asian',1,0)]
data[,race_other:=ifelse(!(race %in% c('White', 'Black or African American', 'Asian')),1,0)]

#### balance table by treatment condition
# study 1
# Permutation test of covariate balance
# heteroskedasticity-robust Wald statistic for the hypothesis that all the coefficients on the covariates are zero
# Null hypothesis is that the slope coefficients are all zero, i.e. R beta=0
# t1 vs. t2, t1 vs. t3

fit.t1.t2 <- lm(t2 ~ male + white + black + asian + college +
                  latino + democrat + strong_partisan +
                  work + w_partner + native_born +
                  att_mig + interest_politics,
                data=data[study=='Florida Study' & treatment<3], singular.ok = FALSE)

Rbeta.hat <- coef(fit.t1.t2)[-1]
RVR <- sandwich::vcovHC(fit.t1.t2, type <- 'HC0')[-1,-1]
W_obs <- as.numeric(Rbeta.hat %*% solve(RVR, Rbeta.hat)) # Wooldridge, equation (4.13)

set.seed(1582)
sims <- 10000
N <- nrow(data[study=='Florida Study' & treatment<3])
N.treated <- data[study=='Florida Study' & treatment<3, sum(t2)]
W_sims <- numeric(sims)

for(i in 1:sims){
  treatment.sim <- randomizr::complete_ra(N, N.treated)
  data_sim <- cbind(data[study=='Florida Study' & treatment<3],treatment.sim)
  fit_sim <- lm(treatment.sim ~ male + white + black + asian + college +
                  latino + democrat +  strong_partisan +
                  work + w_partner + native_born +
                  att_mig + interest_politics, data=data_sim, singular.ok = FALSE)
  Rbeta.hat <- coef(fit_sim)[-1]
  RVR <- sandwich::vcovHC(fit_sim, type <- 'HC0')[-1,-1]
  W_sims[i] <- as.numeric(Rbeta.hat %*% solve(RVR, Rbeta.hat))
}

# p-value
p <- mean(W_sims >= W_obs)
# Balance table
summary_stats <- data[study=='Florida Study' & treatment<3, .(lapply(.SD, mean, na.rm=TRUE)), by=t2,
                      .SDcols=c('male' , 'white' , 'black', 'asian', 'college' ,
                                'latino' , 'democrat' , 'strong_partisan' ,
                                'work' , 'w_partner' , 'native_born' ,
                                'att_mig' , 'interest_politics')]

vars <- c('male' , 'white' , 'black', 'asian', 'college' ,
          'latino' , 'democrat', 'strong_partisan' ,
          'work' , 'w_partner' , 'native_born' ,
          'att_mig' , 'interest_politics')
summary_stats <- cbind(summary_stats, rep(vars,2))
names(summary_stats) <- c('treatment', 'mean', 'var')
summary_stats$mean <-  unlist(summary_stats$mean)
summary_stats <- dcast(summary_stats[,.(var, treatment, mean)], var~treatment, value.var=c('mean'))

summary_stats <- summary_stats[vars, on="var"]

p_value <- c()
difference <- c()
for (i in vars) {
  vartest <- var.test(get(i) ~ t2, data = data[study=='Florida Study' & treatment<3])
  if (vartest$p.value > 0.1) {
    ttest <- t.test(get(i) ~ t2, data = data[study=='Florida Study' & treatment<3], var.equal = TRUE)
    
  } else {
    ttest <- t.test(get(i) ~ t2, data = data[study=='Florida Study' & treatment<3])
  }
  difference <- c(difference, unname(ttest$estimate[2] - ttest$estimate[1]))
  p_value <- c(p_value, ttest$p.value)
}

summary_stats <- cbind(summary_stats, difference, p_value)
summary_stats <- rbind(summary_stats,
                       t(c('N', data[study=='Florida Study' & t1==1,.N], data[study=='Florida Study' & t2==1,.N], '', '')),
                       t(c('Wald statistic','', '','',p)),
                       use.names=FALSE)
summary_stats.t1.t2 <- summary_stats[,lapply(.SD, as.numeric),by=var]

# t1 vs t3
fit.t1.t3 <- lm(t3 ~ male + white + black + asian + college +
                  latino + democrat + strong_partisan +
                  work + w_partner + native_born +
                  att_mig + interest_politics,
                data=data[study=='Florida Study' & treatment!=2], singular.ok = FALSE)

Rbeta.hat <- coef(fit.t1.t3)[-1]
RVR <- sandwich::vcovHC(fit.t1.t3, type <- 'HC0')[-1,-1]
W_obs <- as.numeric(Rbeta.hat %*% solve(RVR, Rbeta.hat)) # Wooldridge, equation (4.13)

set.seed(1582)
sims <- 10000
N <- nrow(data[study=='Florida Study' & treatment!=2])
N.treated <- data[study=='Florida Study' & treatment!=2, sum(t3)]
W_sims <- numeric(sims)

for(i in 1:sims){
  treatment.sim <- randomizr::complete_ra(N, N.treated)
  data_sim <- cbind(data[study=='Florida Study' & treatment!=2],treatment.sim)
  fit_sim <- lm(treatment.sim ~ male + white + black + asian + college +
                  latino + democrat + strong_partisan +
                  work + w_partner + native_born +
                  att_mig + interest_politics, data=data_sim, singular.ok = FALSE)
  Rbeta.hat <- coef(fit_sim)[-1]
  RVR <- sandwich::vcovHC(fit_sim, type <- 'HC0')[-1,-1]
  W_sims[i] <- as.numeric(Rbeta.hat %*% solve(RVR, Rbeta.hat))
}

# p-value
p <- mean(W_sims >= W_obs)
# Balance table
summary_stats <- data[study=='Florida Study' & treatment!=2, .(lapply(.SD, mean, na.rm=TRUE)), by=t3,
                      .SDcols=c('male' , 'white' , 'black', 'asian', 'college' ,
                                'latino' , 'democrat'  , 'strong_partisan' ,
                                'work' , 'w_partner' , 'native_born' ,
                                'att_mig' , 'interest_politics')]

vars <- c('male' , 'white' , 'black', 'asian', 'college' ,
          'latino' , 'democrat' ,  'strong_partisan' ,
          'work' , 'w_partner' , 'native_born' ,
          'att_mig' , 'interest_politics')
summary_stats <- cbind(summary_stats, rep(vars,2))
names(summary_stats) <- c('treatment', 'mean', 'var')
summary_stats$mean <-  unlist(summary_stats$mean)
summary_stats <- dcast(summary_stats[,.(var, treatment, mean)], var~treatment, value.var=c('mean'))

summary_stats <- summary_stats[vars, on="var"]

p_value <- c()
difference <- c()
for (i in vars) {
  vartest <- var.test(get(i) ~ t3, data = data[study=='Florida Study' & treatment!=2])
  if (vartest$p.value > 0.1) {
    ttest <- t.test(get(i) ~ t3, data = data[study=='Florida Study' & treatment!=2], var.equal = TRUE)
    
  } else {
    ttest <- t.test(get(i) ~ t3, data = data[study=='Florida Study' & treatment!=2])
  }
  difference <- c(difference, unname(ttest$estimate[2] - ttest$estimate[1]))
  p_value <- c(p_value, ttest$p.value)
}

summary_stats <- cbind(summary_stats, difference, p_value)
summary_stats <- rbind(summary_stats,
                       t(c('N', data[study=='Florida Study' & t1==1,.N], data[study=='Florida Study' & t3==1,.N], '', '')),
                       t(c('Wald statistic','', '','',p)),
                       use.names=FALSE)
summary_stats.t1.t3 <- summary_stats[,lapply(.SD, as.numeric),by=var]
setnames(summary_stats.t1.t3, c('1', 'difference', 'p_value'), c('2', 'difference 2', 'p_value 2'))

summary_stats <- cbind(summary_stats.t1.t2, summary_stats.t1.t3[,c('var', '0'):=.(NULL, NULL)])
setcolorder(summary_stats, c('var', '0', '1', '2', 'difference', 'p_value', 'difference 2', 'p_value 2'))
summary_stats_study1 <- summary_stats

# study 2
fit.t1.t2 <- lm(t2 ~ male + white + black + asian + college +
                  latino + democrat +  strong_partisan +
                  work + w_partner + native_born +
                  att_mig + interest_politics,
                data=data[study=='U.S. Study' & treatment<3], singular.ok = FALSE)

Rbeta.hat <- coef(fit.t1.t2)[-1]
RVR <- sandwich::vcovHC(fit.t1.t2, type <- 'HC0')[-1,-1]
W_obs <- as.numeric(Rbeta.hat %*% solve(RVR, Rbeta.hat)) # Wooldridge, equation (4.13)

set.seed(4321)
sims <- 10000
N <- nrow(data[study=='U.S. Study' & treatment<3])
N.treated <- data[study=='U.S. Study' & treatment<3, sum(t2)]
W_sims <- numeric(sims)

for(i in 1:sims){
  treatment.sim <- randomizr::complete_ra(N, N.treated)
  data_sim <- cbind(data[study=='U.S. Study' & treatment<3],treatment.sim)
  fit_sim <- lm(treatment.sim ~ male + white + black + asian + college +
                  latino + democrat +  strong_partisan +
                  work + w_partner + native_born +
                  att_mig + interest_politics, data=data_sim, singular.ok = FALSE)
  Rbeta.hat <- coef(fit_sim)[-1]
  RVR <- sandwich::vcovHC(fit_sim, type <- 'HC0')[-1,-1]
  W_sims[i] <- as.numeric(Rbeta.hat %*% solve(RVR, Rbeta.hat))
}

# p-value
p <- mean(W_sims >= W_obs)
# Balance table
summary_stats <- data[study=='U.S. Study' & treatment<3, .(lapply(.SD, mean, na.rm=TRUE)), by=t2,
                      .SDcols=c('male' , 'white' , 'black', 'asian', 'college' ,
                                'latino' , 'democrat' ,  'strong_partisan' ,
                                'work' , 'w_partner' , 'native_born' ,
                                'att_mig' , 'interest_politics')]

vars <- c('male' , 'white' , 'black', 'asian', 'college' ,
          'latino' , 'democrat' , 'strong_partisan' ,
          'work' , 'w_partner' , 'native_born' ,
          'att_mig' , 'interest_politics')
summary_stats <- cbind(summary_stats, rep(vars,2))
names(summary_stats) <- c('treatment', 'mean', 'var')
summary_stats$mean <-  unlist(summary_stats$mean)
summary_stats <- dcast(summary_stats[,.(var, treatment, mean)], var~treatment, value.var=c('mean'))

summary_stats <- summary_stats[vars, on="var"]

p_value <- c()
difference <- c()
for (i in vars) {
  vartest <- var.test(get(i) ~ t2, data = data[study=='U.S. Study' & treatment<3])
  if (vartest$p.value > 0.1) {
    ttest <- t.test(get(i) ~ t2, data = data[study=='U.S. Study' & treatment<3], var.equal = TRUE)
    
  } else {
    ttest <- t.test(get(i) ~ t2, data = data[study=='U.S. Study' & treatment<3])
  }
  difference <- c(difference, unname(ttest$estimate[2] - ttest$estimate[1]))
  p_value <- c(p_value, ttest$p.value)
}

summary_stats <- cbind(summary_stats, difference, p_value)
summary_stats <- rbind(summary_stats,
                       t(c('N', data[study=='U.S. Study' & t1==1,.N], data[study=='U.S. Study' & t2==1,.N], '', '')),
                       t(c('Wald statistic','', '','',p)),
                       use.names=FALSE)
summary_stats.t1.t2 <- summary_stats[,lapply(.SD, as.numeric),by=var]

# t1 vs t3
fit.t1.t3 <- lm(t3 ~ male + white + black + asian + college +
                  latino + democrat +  strong_partisan +
                  work + w_partner + native_born +
                  att_mig + interest_politics,
                data=data[study=='U.S. Study' & treatment!=2], singular.ok = FALSE)

Rbeta.hat <- coef(fit.t1.t3)[-1]
RVR <- sandwich::vcovHC(fit.t1.t3, type <- 'HC0')[-1,-1]
W_obs <- as.numeric(Rbeta.hat %*% solve(RVR, Rbeta.hat)) # Wooldridge, equation (4.13)

set.seed(4321)
sims <- 10000
N <- nrow(data[study=='U.S. Study' & treatment!=2])
N.treated <- data[study=='U.S. Study' & treatment!=2, sum(t3)]
W_sims <- numeric(sims)

for(i in 1:sims){
  treatment.sim <- randomizr::complete_ra(N, N.treated)
  data_sim <- cbind(data[study=='U.S. Study' & treatment!=2],treatment.sim)
  fit_sim <- lm(treatment.sim ~ male + white + black + asian + college +
                  latino + democrat +  strong_partisan +
                  work + w_partner + native_born +
                  att_mig + interest_politics, data=data_sim, singular.ok = FALSE)
  Rbeta.hat <- coef(fit_sim)[-1]
  RVR <- sandwich::vcovHC(fit_sim, type <- 'HC0')[-1,-1]
  W_sims[i] <- as.numeric(Rbeta.hat %*% solve(RVR, Rbeta.hat))
}

# p-value
p <- mean(W_sims >= W_obs)
# Balance table
summary_stats <- data[study=='U.S. Study' & treatment!=2, .(lapply(.SD, mean, na.rm=TRUE)), by=t3,
                      .SDcols=c('male' , 'white' , 'black', 'asian', 'college' ,
                                'latino' , 'democrat' , 'strong_partisan' ,
                                'work' , 'w_partner' , 'native_born' ,
                                'att_mig' , 'interest_politics')]

vars <- c('male' , 'white' , 'black', 'asian', 'college' ,
          'latino' , 'democrat' , 'strong_partisan' ,
          'work' , 'w_partner' , 'native_born' ,
          'att_mig' , 'interest_politics')
summary_stats <- cbind(summary_stats, rep(vars,2))
names(summary_stats) <- c('treatment', 'mean', 'var')
summary_stats$mean <-  unlist(summary_stats$mean)
summary_stats <- dcast(summary_stats[,.(var, treatment, mean)], var~treatment, value.var=c('mean'))

summary_stats <- summary_stats[vars, on="var"]

p_value <- c()
difference <- c()
for (i in vars) {
  vartest <- var.test(get(i) ~ t3, data = data[study=='U.S. Study' & treatment!=2])
  if (vartest$p.value > 0.1) {
    ttest <- t.test(get(i) ~ t3, data = data[study=='U.S. Study' & treatment!=2], var.equal = TRUE)
    
  } else {
    ttest <- t.test(get(i) ~ t3, data = data[study=='U.S. Study' & treatment!=2])
  }
  difference <- c(difference, unname(ttest$estimate[2] - ttest$estimate[1]))
  p_value <- c(p_value, ttest$p.value)
}

summary_stats <- cbind(summary_stats, difference, p_value)
summary_stats <- rbind(summary_stats,
                       t(c('N', data[study=='U.S. Study' & t1==1,.N], data[study=='U.S. Study' & t3==1,.N], '', '')),
                       t(c('Wald statistic','', '','',p)),
                       use.names=FALSE)
summary_stats.t1.t3 <- summary_stats[,lapply(.SD, as.numeric),by=var]
setnames(summary_stats.t1.t3, c('1', 'difference', 'p_value'), c('2', 'difference 2', 'p_value 2'))

summary_stats <- cbind(summary_stats.t1.t2, summary_stats.t1.t3[,c('var', '0'):=.(NULL, NULL)])
setcolorder(summary_stats, c('var', '0', '1', '2', 'difference', 'p_value', 'difference 2', 'p_value 2'))
summary_stats_study2 <- summary_stats

summary_stats <- rbind(summary_stats_study1, summary_stats_study2) 

summary_table <- print(xtable::xtable(summary_stats, caption="Balance of Individual Characteristics Across Treatment Conditions",
                                      digits=c(0,0,3,3,3,3,3,3,3),
                                      caption.placement='top', align='lcccccccc'),
                       file = './replication_hasz/output/tables/tabB2.tex',
                       include.rownames=FALSE,
                       caption.placement = "top")
